Analyze Station Locations in Vancouver

In [1]:
import os
import zipfile

import numpy as np
import pandas as pd
import requests

# create map
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
In [2]:
# Load every table of the GTFS feed into GTFS: a dictionary that maps the
# table name (the archive member name up to its first ".") to a DataFrame.
GTFS = {}
# The context managers close the archive and each member stream as soon as it
# has been read, instead of leaving the handles open for the life of the kernel.
with zipfile.ZipFile("ProjectData/google_transit.zip") as gtfs_zip:
    for table in gtfs_zip.namelist():
        table_name = table.split(".")[0]
        with gtfs_zip.open(table) as table_file:
            GTFS[table_name] = pd.read_csv(table_file)
print(GTFS.keys())
dict_keys(['agency', 'calendar', 'calendar_dates', 'feed_info', 'routes', 'shapes', 'stops', 'stop_times', 'transfers', 'trips', 'cardinal_directions_exceptions', 'direction_names_exceptions', 'stop_order_exceptions'])
In [3]:
# The feed holds 8916 stops. Each SkyTrain platform row names its station in
# parent_station, and the station itself is a separate row in the same table.
# Print the platform rows of station 99917, then the station row itself.
print(GTFS["stops"][GTFS["stops"]["parent_station"] == 99917])
print("---------------------------------------")
print(GTFS["stops"][GTFS["stops"]["stop_id"] == 99917])
      stop_id  stop_code                               stop_name  stop_desc  \
7854     8044    57969.0  Commercial-Broadway Station Platform 4        NaN   
7886     8073    57998.0  Commercial-Broadway Station Platform 3        NaN   
8183     8754    60822.0  Commercial-Broadway Station Platform 1        NaN   
8192     8763    60823.0  Commercial-Broadway Station Platform 2        NaN   

       stop_lat    stop_lon zone_id  stop_url  location_type  parent_station  
7854  49.262317 -123.069179    ZN 1       NaN              0         99917.0  
7886  49.262311 -123.069077    ZN 1       NaN              0         99917.0  
8183  49.262980 -123.068426    ZN 1       NaN              0         99917.0  
8192  49.262912 -123.068491    ZN 1       NaN              0         99917.0  
---------------------------------------
      stop_id  stop_code                    stop_name  stop_desc  stop_lat  \
8874    99917        NaN  Commercial-Broadway Station        NaN  49.26267   

        stop_lon zone_id  stop_url  location_type  parent_station  
8874 -123.068765    ZN 1       NaN              1             NaN  
In [4]:
# Build two stop tables from the GTFS stops:
#   1. existing rapid transit stations (the stops used as a parent_station):
#      SkyTrain, SeaBus and West Coast Express
#   2. candidate stations: existing bus stops

# 1. a station is any stop_id that another row names as its parent_station
RapidTransitStop_list = GTFS["stops"]["parent_station"].dropna().drop_duplicates().tolist()
print("%d rapid transit stations"%len(RapidTransitStop_list))
RapidTransitStop_df = GTFS["stops"][GTFS["stops"]["stop_id"].isin(RapidTransitStop_list)]
print(RapidTransitStop_df.head())
print(RapidTransitStop_df.shape)

# 2. a bus stop has no parent_station and is not itself one of the stations
BusStop_filter = (
    GTFS["stops"]["parent_station"].isnull()
    & ~GTFS["stops"]["stop_id"].isin(RapidTransitStop_list)
)
BusStop_df = GTFS["stops"][BusStop_filter]
print("%d bus stops"%len(BusStop_df))
print(BusStop_df.head())
print(BusStop_df.shape)
59 rapid transit stations
      stop_id  stop_code                   stop_name  stop_desc   stop_lat  \
1466    12034        NaN          Waterfront Station        NaN  49.285687   
8858    99901        NaN         YVR-Airport Station        NaN  49.194174   
8859    99902        NaN   Sea Island Centre Station        NaN  49.192986   
8860    99903        NaN           Templeton Station        NaN  49.196688   
8861    99904        NaN  Richmond-Brighouse Station        NaN  49.167943   

        stop_lon zone_id  stop_url  location_type  parent_station  
1466 -123.111773    ZN 1       NaN              1             NaN  
8858 -123.178269    ZN 2       NaN              1             NaN  
8859 -123.157887    ZN 2       NaN              1             NaN  
8860 -123.146337    ZN 2       NaN              1             NaN  
8861 -123.136372    ZN 2       NaN              1             NaN  
(59, 10)
8726 bus stops
   stop_id  stop_code                          stop_name  stop_desc  \
0    10000    59326.0   Northbound No. 5 Rd @ McNeely Dr        NaN   
1    10001    59324.0  Northbound No. 5 Rd @ Woodhead Rd        NaN   
2    10002    59323.0    Southbound No. 5 Rd @ Cambie Rd        NaN   
3    10003    59325.0  Southbound No. 5 Rd @ Woodhead Rd        NaN   
4    10004    59327.0    Eastbound McNeely Dr @ No. 5 Rd        NaN   

    stop_lat    stop_lon zone_id  stop_url  location_type  parent_station  
0  49.179962 -123.091490  BUS ZN       NaN              0             NaN  
1  49.182670 -123.091448  BUS ZN       NaN              0             NaN  
2  49.184252 -123.091627  BUS ZN       NaN              0             NaN  
3  49.182051 -123.091659  BUS ZN       NaN              0             NaN  
4  49.179586 -123.091105  BUS ZN       NaN              0             NaN  
(8726, 10)
In [5]:
# Show the bus stops and the rapid transit stations on one map.

# The map is centred on New Westminster; Nominatim converts the address into
# latitude and longitude values.
from geopy.geocoders import Nominatim
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

Station_Map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Bus stops are added first, as small faint markers.
for lat, lng, stopid, stopname in zip(BusStop_df['stop_lat'], BusStop_df['stop_lon'], BusStop_df['stop_id'], BusStop_df['stop_name']):
    label = folium.Popup('{}, {}'.format(stopid, stopname), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=1,
        popup=label,
        color='#9b99c4',
        fill=True,
        fill_color='#9b99c4',
        fill_opacity=0.1,
        parse_html=False).add_to(Station_Map)

# Rapid transit stations are added afterwards, as larger blue markers.
for lat, lng, stopid, stopname in zip(RapidTransitStop_df['stop_lat'], RapidTransitStop_df['stop_lon'], RapidTransitStop_df['stop_id'], RapidTransitStop_df['stop_name']):
    label = folium.Popup('{}, {}'.format(stopid, stopname), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.9,
        parse_html=False).add_to(Station_Map)

Station_Map
Out[5]:
In [ ]:
## Foursquare API - Venue Data

# Settings shared by the Foursquare requests below.
# The credentials are read from the environment so that they are not stored in
# the notebook: export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. A key pair that has ever been committed with the
# notebook must be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print("WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
          "Foursquare requests will be rejected")
VERSION = '20171101' # Foursquare API version
# NOTE(review): getNearbyVenues has its own `radius` parameter (default 500)
# and the calls in this notebook do not pass it, so this module-level value
# does not reach those calls - confirm which radius is intended.
radius = 400 #m
LIMIT = 100 #no more than 100 venues per neighbourhood

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare returns around each location.

    One request is sent to the ``venues/explore`` endpoint per location, using
    the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Identifier and coordinates of each location, iterated in parallel.
    radius : int, default 500
        Search radius forwarded to the API.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns stop_id, stop_lat,
        stop_lon, VenueID, Venue, Venue Latitude, Venue Longitude and
        Venue Category. The frame is empty (but has these columns) when no
        venue was collected.

    Notes
    -----
    A location whose request fails, or whose response holds no result group,
    is reported on stdout and skipped, so one bad request does not abort a
    long download.
    """
    columns = ['stop_id',
               'stop_lat',
               'stop_lon',
               'VenueID',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # query parameters; requests takes care of the URL encoding
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request
        try:
            # the timeout keeps a stalled connection from hanging the loop
            response = requests.get(endpoint, params=params, timeout=30).json()["response"]
            results = response['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
            # Network failure, non-JSON body, or a response without result
            # groups (no venue around, or an API error such as an exceeded
            # quota). Unlike a bare except, KeyboardInterrupt still stops the loop.
            print('  skipped {}: {!r}'.format(name, err))
            continue # to the next record

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['id'],
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None instead of raising
                categories[0]['name'] if categories else None))

    # passing the columns here also works when no row was collected
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)
In [ ]:
# Download the venues around every rapid transit station and save them as CSV.
rapidtransit_venues = getNearbyVenues(
    RapidTransitStop_df['stop_id'],
    RapidTransitStop_df['stop_lat'],
    RapidTransitStop_df['stop_lon'],
)
rapidtransit_venues.to_csv("ProjectData/API_rapidtransit_venues.csv", index=False)
In [ ]:
def splitDataFrameIntoSmaller(df, chunkSize = 100):
    """Split a DataFrame into consecutive row chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to split.
    chunkSize : int, default 100
        Maximum number of rows per chunk; must be positive.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in row order. Every chunk has chunkSize rows except possibly
        the last, which holds the remainder. No empty chunk is produced: a
        frame whose length is an exact multiple of chunkSize yields exactly
        len(df) / chunkSize chunks, and an empty frame yields an empty list.

    Raises
    ------
    ValueError
        If chunkSize is not positive.
    """
    if chunkSize <= 0:
        raise ValueError("chunkSize must be a positive integer")
    # Stepping over the start offsets avoids the trailing empty slice that
    # `len(df) // chunkSize + 1` produces for exact multiples.
    return [df[start:start + chunkSize] for start in range(0, len(df), chunkSize)]
In [ ]:
# The Foursquare API is queried one section of bus stops at a time, so cut the
# bus stop table into sections of at most 100 rows.
BusStop_df_sections = splitDataFrameIntoSmaller(BusStop_df, chunkSize=100)
print("%d sections of (max) 100 stops"%len(BusStop_df_sections))
In [ ]:
# Download the venues around the bus stops, one section per CSV file: the
# section at index i is written to API_bus_venues_section<i+1>.csv.
# Every section is visited (`range(-1)` is empty, so a loop over it never
# runs), and a section whose file already exists is skipped, so a download
# interrupted by the daily API quota can be resumed without repeating requests.
for section in range(len(BusStop_df_sections)):
    section_csv = "ProjectData/API_bus_venues_section%d.csv"%(section+1)
    section_stops = BusStop_df_sections[section]
    if section_stops.empty or os.path.exists(section_csv):
        continue
    # kept under its own name so the rapid transit venues held in
    # rapidtransit_venues are not overwritten by bus stop results
    bus_venues = getNearbyVenues(names=section_stops['stop_id'],
                                 latitudes=section_stops['stop_lat'],
                                 longitudes=section_stops['stop_lon']
                                )
    bus_venues.to_csv(section_csv,index=False)
In [ ]:
# Check the Foursquare API limit with a single request for one venue.
# Note: code 429 in the reply means you are over the daily call quota limit;
# the quota resets at midnight UTC (5pm Vancouver time).
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    40.794123, -73.953961,  # ll: latitude, longitude
    500,                    # radius
    1)                      # limit
requests.get(url).json()
In [6]:
# Load all the venues saved from the API calls: every CSV file in ProjectData
# whose name contains "API".
import os
directory = "ProjectData"
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the concatenated frame the same on every run and every machine.
venue_files = sorted(
    filename for filename in os.listdir(directory)
    if "API" in filename and filename.endswith(".csv")
)
venue_df_list = [pd.read_csv(os.path.join(directory, filename)) for filename in venue_files]
# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row numbers.
venue_df = pd.concat(venue_df_list, axis=0, ignore_index=True)

# check the venue categories
print('{} stops in the venue dataframe.'.format(len(venue_df['stop_id'].unique())))
print('There are {} uniques categories.'.format(len(venue_df['Venue Category'].unique())))
print('There are {} uniques venues.'.format(len(venue_df['VenueID'].unique())))
print(venue_df.head())
8340 stops in the venue dataframe.
There are 451 uniques categories.
There are 10861 uniques venues.
   stop_id   stop_lat    stop_lon                   VenueID  \
0     2660  49.209736 -122.994325  4bae7801f964a5203db63be3   
1     2660  49.209736 -122.994325  551c1d13498e6b30fa879f25   
2     2660  49.209736 -122.994325  4bd8948011dcc9280feef633   
3     2660  49.209736 -122.994325  4bf997345ec320a1bf848ad3   
4     2660  49.209736 -122.994325  4ba545dff964a5208df438e3   

                                  Venue  Venue Latitude  Venue Longitude  \
0                Gardenworks Mandeville       49.206704      -122.998270   
1  Riverway Golf Course & Driving Range       49.207437      -122.996536   
2                     Sun Tai Sang Farm       49.206914      -122.994924   
3                     Wing Wong Nursery       49.206530      -122.993784   
4                          Garden Works       49.206706      -122.998498   

   Venue Category  
0   Garden Center  
1     Golf Course  
2  Farmers Market  
3   Garden Center  
4   Garden Center  
In [7]:
# Inspect the venues recorded around one station (chosen by stop_id) on a map.
stopid = 99931

# the GTFS row of the chosen stop provides the map centre and the stop name
stop_row = GTFS["stops"].loc[GTFS["stops"]["stop_id"]==stopid]
latitude = stop_row["stop_lat"].tolist()[0]
longitude = stop_row["stop_lon"].tolist()[0]
stopname = stop_row["stop_name"].tolist()[0]

station_venues = venue_df[venue_df["stop_id"]==stopid]
print("%d venue(s) near %s"%(len(station_venues),stopname))

# map centred on the stop
StationVenue_Map = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=16)

# one small marker per venue, labelled with the venue name and category
for lat, lng, venuename, venuetype in zip(station_venues['Venue Latitude'], station_venues['Venue Longitude'], station_venues['Venue'], station_venues['Venue Category']):
    label = folium.Popup('{}, {}'.format(venuename, venuetype), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=4,
        popup=label,
        color='#d17389',
        fill=True,
        fill_color='#d17389',
        fill_opacity=0.9,
        parse_html=False).add_to(StationVenue_Map)

# a larger, darker marker for the stop itself
label = folium.Popup('{}, {}'.format(stopid, stopname), parse_html=True)
folium.CircleMarker(
    location=[latitude, longitude],
    radius=10,
    popup=label,
    color='#8a1631',
    fill=True,
    fill_color='#8a1631',
    fill_opacity=0.9,
    parse_html=False).add_to(StationVenue_Map)

StationVenue_Map
38 venue(s) near Lincoln Station
Out[7]:
In [51]:
# Map every distinct Foursquare venue category to one major "Purpose".
#
# venue_type gets one row per distinct 'Venue Category' in venue_df. The
# category list is written to VenueType.csv before the Purpose column is
# added, so that file holds only the category names.
#
# Labelling rule: every category starts as "Other"; then, for each
# (label, keywords) pair in list order and each keyword in turn, all
# categories whose name contains the keyword are set to that label,
# overwriting any label assigned earlier. A category that matches keywords of
# several groups therefore ends up with the group that appears LAST in
# venue_type_lookup, so the order of the groups is significant.
# str.contains is used with its defaults: the match is case-sensitive, is a
# substring match (not whole words) and treats the keyword as a regular
# expression.
#
# NOTE(review): "Message" under HealthCare looks like a typo of "Massage"
# (which is also listed), and "Centre" appears twice under Recreation -
# confirm and clean up.
# NOTE(review): because matching is by substring, short keywords such as
# "Car", "Bus", "Bar" or "IT" can also match unrelated category names -
# verify the resulting labels against VenueType.csv.
#print(venue_df['Venue Category'].unique())
venue_type = venue_df[['Venue Category']].drop_duplicates()
venue_type.to_csv("VenueType.csv", index=False)
venue_type["Purpose"] = "Other"
venue_type_lookup = [# type label,  list of search keywords
                     ["Residential",   ["Neighborhood","Multiplex","Condo"]],
                     ["Restaurant",    ["Restaurant", "Bar", "Pizza","Sandwich",
                                        "Cafe","Café","Diner","Burger","Deli","Breakfast",
                                        "Food","Joint","Salad","Snack","Taco","Soup",
                                        "Fried Chicken","Bakery","Pub","Tea","Poke",
                                        "Brewery","Noodle","Burrito","Steakhouse","Speakeasy",
                                        "Buffet","Bistro","Creperie","Winery"]],
                     ["Shop",          ["Store","Shop","Supermarket","Gastropub","Mall","Bookstore",
                                        "Market","Butcher","Boutique","Dealership","Grocery","Supply"]],
                     ["Service",       ["Service","Repair","Rental","Locksmith","Storage","Laundromat",
                                        "Workshop","Cleaner","Photography","Cemetery","Astrologer"]],
                     ["GreenSpace",    ["Trail","Public Art","Park","Track","Garden","Dog",
                                        "Lake","Farm","Beach","Lookout","Nature","Mountain","Tree",
                                        "River","Forest","Waterfall","Outdoor","Stables","Bay"]],
                     ["Bank",          ["Bank","ATM","Credit"]],
                     ["HealthCare",    ["Chiropractor","Physical Therapist","Pharmacy",
                                        "Message","Dispensary","Spa","Massage","Health","Medical"]],
                     ["Hotel",         ["Hotel","Motel","Inn","Hostel","Salon"]],
                     ["School",        ["School","Library","University","College"]],
                     ["Office",        ["Office","Tech Startup","Lawyer","Notary","Legal","IT",
                                        "City Hall","Landscaping","Building"]],
                     ["Recreation",    ["Gym","Yoga","Dance","Golf","Laser Tag","Entertainment",
                                        "Rock Club","Photography Studio","Stadium","Bowling Alley",
                                        "Club","Pool","Theater","Skating","Sports","Soccer",
                                        "Nightclub","Casino","Studio","Hockey","Court","Venue",
                                        "Circus","Field","Lounge","Curling","Hall","Centre","Martial",
                                        "Recreation","Centre","Center","Plaza","Playground","Waterfront",
                                        "Arcade","Planetarium","Nightlife","Play","Rock Climbing",
                                        "Amphitheater","Surf","Roof","Racetrack","Moving Target"]],
                     ["Attraction",    ["Museum","Gallery","Fair","Landmark","Resort",
                                        "Aquarium","Site","Ski","Campground","Church"]],
                     ["Transportation",["Bus","Station","Airport","Travel Lounge","Heliport",
                                        "Transportation","Cruise","Transport","Border","Car",
                                        "Harbor","Travel","Pier","Ferry","Road","Bridge","Plane",
                                        "Platform","Garage","Train","Intersection","Rest Area"]]]
for label, keywords in venue_type_lookup:
    for keyword in keywords:
        # overwrite the label wherever the keyword occurs; keep it elsewhere
        venue_type["Purpose"] = np.where(venue_type["Venue Category"].str.contains(keyword),label,venue_type["Purpose"])
# list the categories no keyword matched (the recorded output is an empty list)
venue_type[venue_type["Purpose"]=="Other"]["Venue Category"].tolist()
Out[51]:
[]
In [52]:
# Count the distinct venues per category (a venue reached from several stops
# is counted once), most frequent first, then attach each category's Purpose.
venue_type_count = (
    venue_df[['VenueID', 'Venue Category']]
    .drop_duplicates()
    .groupby(["Venue Category"])
    .count()
    .reset_index()
    .sort_values(by=["VenueID"], ascending=False)
    .merge(venue_type, on="Venue Category")
)
print(venue_type_count.head(30))
                Venue Category  VenueID         Purpose
0                  Coffee Shop      590            Shop
1                         Park      415      GreenSpace
2                     Bus Stop      331  Transportation
3                  Pizza Place      269      Restaurant
4               Sandwich Place      256      Restaurant
5           Chinese Restaurant      252      Restaurant
6             Sushi Restaurant      251      Restaurant
7                Grocery Store      235            Shop
8         Fast Food Restaurant      217      Restaurant
9                         Café      215      Restaurant
10  Construction & Landscaping      186          Office
11         Japanese Restaurant      186      Restaurant
12                  Restaurant      170      Restaurant
13       Vietnamese Restaurant      166      Restaurant
14           Convenience Store      156            Shop
15                        Bank      148            Bank
16                      Bakery      148      Restaurant
17                    Pharmacy      135      HealthCare
18      Furniture / Home Store      133            Shop
19                         Gym      127      Recreation
20                Liquor Store      119            Shop
21                         Pub      116      Restaurant
22                 Bus Station      113  Transportation
23                       Hotel      112           Hotel
24              Clothing Store      110            Shop
25           Indian Restaurant      108      Restaurant
26              Ice Cream Shop      106            Shop
27                Burger Joint      103      Restaurant
28                       Trail      100      GreenSpace
29                Home Service      100         Service
In [ ]:
 
In [55]:
# Show every collected venue on one map.

# The map is centred on New Westminster; Nominatim converts the address into
# latitude and longitude values.
from geopy.geocoders import Nominatim
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# one row per distinct venue record, regardless of which stop it was found from
venue_columns = ['VenueID','Venue Latitude','Venue Longitude','Venue','Venue Category']
station_venues = venue_df[venue_columns].drop_duplicates()

StationVenue_Map = folium.Map(location=[latitude, longitude], tiles='cartodbpositron', zoom_start=11)

# one small faint marker per venue, labelled with the venue name and category
for lat, lng, venuename, venuetype in zip(station_venues['Venue Latitude'], station_venues['Venue Longitude'], station_venues['Venue'], station_venues['Venue Category']):
    label = folium.Popup('{}, {}'.format(venuename, venuetype), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=1,
        popup=label,
        color='#d17389',
        fill=True,
        fill_color='#d17389',
        fill_opacity=0.2,
        parse_html=False).add_to(StationVenue_Map)

print("All venues")
StationVenue_Map

Run K-means clustering on Rapid Transit Stops

In [57]:
# From here on only the venues around rapid transit stations are analysed.
# Each venue is joined to the Purpose of its category, and the fine-grained
# category is then replaced by that major purpose.
venue_df = pd.read_csv("ProjectData/API_rapidtransit_venues.csv").merge(venue_type, on="Venue Category")
venue_df["Venue Category"] = venue_df["Purpose"]

# check the venue categories
print('{} stops in the venue dataframe.'.format(venue_df['stop_id'].unique().size))
print('There are {} uniques categories.'.format(venue_df['Venue Category'].unique().size))
print('There are {} uniques venues.'.format(venue_df['VenueID'].unique().size))
print(venue_df.head())
59 stops in the venue dataframe.
There are 13 uniques categories.
There are 1523 uniques venues.
   stop_id   stop_lat    stop_lon                   VenueID  \
0    12034  49.285687 -123.111773  4aa7f561f964a520384e20e3   
1    99904  49.167943 -123.136372  54efa903498e6540d8b063b1   
2    99904  49.167943 -123.136372  5092d3b0e4b0c441550a2497   
3    99904  49.167943 -123.136372  4b3fd96df964a5206cb025e3   
4    99904  49.167943 -123.136372  4bbf9510920eb713ae96172c   

                             Venue  Venue Latitude  Venue Longitude  \
0                  Miku Restaurant       49.286713      -123.112044   
1              Pepper Lunch Canada       49.170897      -123.136569   
2  Umi: Japanese Eatery & Udon Bar       49.167238      -123.138848   
3        Ebisu Japanese Restaurant       49.172373      -123.136200   
4                     Banzai Sushi       49.170651      -123.133057   

  Venue Category     Purpose  
0     Restaurant  Restaurant  
1     Restaurant  Restaurant  
2     Restaurant  Restaurant  
3     Restaurant  Restaurant  
4     Restaurant  Restaurant  
In [58]:
# One-hot encode the venue category: one indicator column per category,
# one row per venue.
stop_onehot = pd.get_dummies(venue_df[['Venue Category']], prefix="", prefix_sep="")
onehot_list = stop_onehot.columns.tolist()

# attach the stop_id of each venue and make it the first column
fix_columns = ['stop_id', *onehot_list]
stop_onehot = stop_onehot.assign(stop_id=venue_df['stop_id'])[fix_columns]

stop_onehot.head()
Out[58]:
stop_id Attraction Bank GreenSpace HealthCare Hotel Office Recreation Residential Restaurant School Service Shop Transportation
0 12034 0 0 0 0 0 0 0 0 1 0 0 0 0
1 99904 0 0 0 0 0 0 0 0 1 0 0 0 0
2 99904 0 0 0 0 0 0 0 0 1 0 0 0 0
3 99904 0 0 0 0 0 0 0 0 1 0 0 0 0
4 99904 0 0 0 0 0 0 0 0 1 0 0 0 0
In [59]:
# Per stop, average the indicator columns: each value is the share of the
# stop's venues that fall in that category.
stop_group = stop_onehot.groupby('stop_id', as_index=False).mean()
stop_group.head()
Out[59]:
stop_id Attraction Bank GreenSpace HealthCare Hotel Office Recreation Residential Restaurant School Service Shop Transportation
0 12034 0.0 0.000000 0.010000 0.020000 0.090000 0.04 0.050000 0.0 0.490000 0.0 0.000000 0.230000 0.070000
1 99901 0.0 0.020408 0.061224 0.000000 0.040816 0.00 0.000000 0.0 0.204082 0.0 0.000000 0.285714 0.387755
2 99902 0.0 0.000000 0.000000 0.000000 0.000000 0.00 0.000000 0.0 0.200000 0.0 0.000000 0.000000 0.800000
3 99903 0.0 0.000000 0.071429 0.000000 0.000000 0.00 0.000000 0.0 0.250000 0.0 0.000000 0.642857 0.035714
4 99904 0.0 0.032967 0.000000 0.021978 0.010989 0.00 0.010989 0.0 0.582418 0.0 0.010989 0.329670 0.000000
In [60]:
#utility function for top x venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of a row.

    The first element of ``row`` is skipped (in this notebook it is the
    stop_id); the remaining values are sorted in descending order and the
    index labels of the first ``num_top_venues`` of them are returned as an
    array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
In [61]:
import numpy as np
# Build a table with the 10 most common venue categories near each stop.

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: stop_id, then "1st Most Common Venue", "2nd ...", "3rd ...",
# "4th ...", and so on.
columns = ['stop_id']
for ind in np.arange(num_top_venues):
    # The first three ranks take their suffix from `indicators`, every later
    # rank takes "th". An explicit bounds check is used instead of a bare
    # `except`, which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per stop
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['stop_id'] = stop_group['stop_id']

# fill the ranking columns of each row from the matching stop_group row
for ind in np.arange(stop_group.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(stop_group.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
Out[61]:
stop_id 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 12034 Restaurant Shop Hotel Transportation Recreation Office HealthCare GreenSpace Service School
1 99901 Transportation Shop Restaurant GreenSpace Hotel Bank Service School Residential Recreation
2 99902 Transportation Restaurant Shop Service School Residential Recreation Office Hotel HealthCare
3 99903 Shop Restaurant GreenSpace Transportation Service School Residential Recreation Office Hotel
4 99904 Restaurant Shop Bank HealthCare Service Recreation Hotel Transportation School Residential
In [62]:
#Cluster Neighbourhoods

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Feature matrix for clustering: the per-stop category shares without the id.
# `columns=` is used because pandas 2.x no longer accepts `axis` as a
# positional argument of DataFrame.drop.
group_clustering = stop_group.drop(columns='stop_id')
In [63]:
# Elbow method: fit k-means for k = 1..14 and record the inertia of each fit.
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    # A fixed seed (the same one as the final model) makes the elbow curve
    # reproducible from run to run.
    km = KMeans(n_clusters=k, random_state=0)
    km = km.fit(group_clustering)
    Sum_of_squared_distances.append(km.inertia_)
In [64]:
import matplotlib.pyplot as plt
# Plot the inertia against k; the bend ("elbow") of the curve suggests k.
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()
In [65]:
# number of clusters for the final model
kclusters = 5

# fit k-means on the per-stop category shares (seeded for reproducibility)
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(group_clustering)

# cluster labels of the first ten stops
kmeans.labels_[:10]
Out[65]:
array([1, 4, 0, 2, 1, 1, 1, 4, 4, 3], dtype=int32)
In [66]:
# Attach each station's cluster label and ranked venue categories to its GTFS
# row.

# add clustering labels
# DataFrame.insert raises if the column already exists, so a label column left
# by an earlier run of this cell is dropped first. This keeps the cell
# re-runnable and keeps 'Cluster Labels' as the first column.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

cluster_merge = RapidTransitStop_df

# Left join on stop_id: every station row is kept and gains the cluster label
# and the ranked categories; a station missing from
# neighborhoods_venues_sorted would get NaN in those columns.
cluster_merge = cluster_merge.join(neighborhoods_venues_sorted.set_index('stop_id'), on='stop_id')

cluster_merge.head() # check the last columns!
Out[66]:
stop_id stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station ... 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
1466 12034 NaN Waterfront Station NaN 49.285687 -123.111773 ZN 1 NaN 1 NaN ... Restaurant Shop Hotel Transportation Recreation Office HealthCare GreenSpace Service School
8858 99901 NaN YVR-Airport Station NaN 49.194174 -123.178269 ZN 2 NaN 1 NaN ... Transportation Shop Restaurant GreenSpace Hotel Bank Service School Residential Recreation
8859 99902 NaN Sea Island Centre Station NaN 49.192986 -123.157887 ZN 2 NaN 1 NaN ... Transportation Restaurant Shop Service School Residential Recreation Office Hotel HealthCare
8860 99903 NaN Templeton Station NaN 49.196688 -123.146337 ZN 2 NaN 1 NaN ... Shop Restaurant GreenSpace Transportation Service School Residential Recreation Office Hotel
8861 99904 NaN Richmond-Brighouse Station NaN 49.167943 -123.136372 ZN 2 NaN 1 NaN ... Restaurant Shop Bank HealthCare Service Recreation Hotel Transportation School Residential

5 rows × 21 columns

In [67]:
# Draw the rapid transit stations on a map, coloured by their k-means cluster.
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# map centre: New Westminster
address = 'New Westminster, BC'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# colour scheme: one hex colour per cluster, evenly spaced over the rainbow
# colormap (only the length of ys is used)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# add one marker per station; cluster c is drawn with rainbow[c-1], so
# cluster 0 takes the last colour of the list
markers_colors = []
for lat, lon, poi, cluster in zip(cluster_merge['stop_lat'], cluster_merge['stop_lon'], cluster_merge['stop_name'], cluster_merge['Cluster Labels']):
    cluster_color = rainbow[cluster-1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters
Out[67]:
In [76]:
# Stations assigned to cluster label 0, showing every column except the first one.
cluster_merge.loc[cluster_merge['Cluster Labels'].eq(0), cluster_merge.columns[1:]]
Out[76]:
stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
8859 NaN Sea Island Centre Station NaN 49.192986 -123.157887 ZN 2 NaN 1 NaN 0 Transportation Restaurant Shop Service School Residential Recreation Office Hotel HealthCare
8880 NaN Sperling-Burnaby Lake Station NaN 49.259195 -122.963998 ZN 2 NaN 1 NaN 0 Transportation Shop Service School Restaurant Residential Recreation Office Hotel HealthCare
8894 NaN Nanaimo Station NaN 49.248321 -123.055908 ZN 1 NaN 1 NaN 0 Transportation GreenSpace Shop Service School Restaurant Residential Recreation Office Hotel
8901 NaN 22nd Street Station NaN 49.200059 -122.948980 ZN 2 NaN 1 NaN 0 Transportation Restaurant Shop Service School Residential Recreation Office Hotel HealthCare
8905 NaN Braid Station NaN 49.233244 -122.882800 ZN 2 NaN 1 NaN 0 Transportation Service Recreation Shop School Restaurant Residential Office Hotel HealthCare
8913 NaN Port Haney Station NaN 49.212202 -122.605240 WCE3Z NaN 1 NaN 0 Transportation Restaurant Shop Service School Residential Recreation Office Hotel HealthCare
In [82]:
# Stations assigned to cluster label 1, showing every column except the first one.
cluster_merge.loc[cluster_merge['Cluster Labels'].eq(1), cluster_merge.columns[1:]]
Out[82]:
stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
1466 NaN Waterfront Station NaN 49.285687 -123.111773 ZN 1 NaN 1 NaN 1 Restaurant Shop Hotel Transportation Recreation Office HealthCare GreenSpace Service School
8861 NaN Richmond-Brighouse Station NaN 49.167943 -123.136372 ZN 2 NaN 1 NaN 1 Restaurant Shop Bank HealthCare Service Recreation Hotel Transportation School Residential
8862 NaN Lansdowne Station NaN 49.174665 -123.136475 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Bank Hotel HealthCare Service Office Transportation School
8863 NaN Aberdeen Station NaN 49.184000 -123.136325 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Transportation Hotel HealthCare Service School Residential Office
8869 NaN Broadway-City Hall Station NaN 49.262717 -123.114822 ZN 1 NaN 1 NaN 1 Restaurant Shop Recreation HealthCare GreenSpace Office Bank Attraction Transportation Service
8870 NaN Olympic Village Station NaN 49.266596 -123.115348 ZN 1 NaN 1 NaN 1 Restaurant Shop Attraction Recreation HealthCare Transportation Service School Residential Office
8871 NaN Yaletown-Roundhouse Station NaN 49.274492 -123.122079 ZN 1 NaN 1 NaN 1 Restaurant Shop Recreation Hotel GreenSpace Transportation HealthCare Bank Service School
8872 NaN Vancouver City Centre Station NaN 49.282415 -123.118227 ZN 1 NaN 1 NaN 1 Restaurant Shop Recreation Hotel Office HealthCare GreenSpace Attraction Transportation Service
8874 NaN Commercial-Broadway Station NaN 49.262670 -123.068765 ZN 1 NaN 1 NaN 1 Restaurant Shop Recreation Transportation Service School Residential Office Hotel HealthCare
8875 NaN Renfrew Station NaN 49.258913 -123.045379 ZN 1 NaN 1 NaN 1 Restaurant Shop Office Transportation Service School Residential Recreation Hotel HealthCare
8877 NaN Gilmore Station NaN 49.264974 -123.013554 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Office Transportation Service School Residential Hotel HealthCare
8879 NaN Holdom Station NaN 49.264734 -122.982169 ZN 2 NaN 1 NaN 1 Restaurant Transportation Shop Recreation Service Hotel HealthCare School Residential Office
8882 NaN Production Way-University Station NaN 49.253416 -122.918223 ZN 2 NaN 1 NaN 1 Restaurant Shop GreenSpace Transportation Service School Residential Recreation Office Hotel
8883 NaN Lougheed Town Centre Station NaN 49.248512 -122.896805 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Hotel HealthCare GreenSpace Bank Transportation Service School
8885 NaN Moody Centre Station NaN 49.278012 -122.846034 ZN 3 NaN 1 NaN 1 Restaurant Shop Transportation Hotel HealthCare GreenSpace Attraction Service School Residential
8891 NaN Granville Station NaN 49.283173 -123.115721 ZN 1 NaN 1 NaN 1 Restaurant Shop Hotel Recreation Office Transportation GreenSpace Attraction Service School
8892 NaN Stadium-Chinatown Station NaN 49.279223 -123.109181 ZN 1 NaN 1 NaN 1 Restaurant Recreation Shop School Hotel Attraction Transportation Service Residential Office
8893 NaN Main Street-Science World Station NaN 49.273137 -123.100392 ZN 1 NaN 1 NaN 1 Restaurant Shop Recreation GreenSpace Attraction Transportation Service School Residential Office
8896 NaN Joyce-Collingwood Station NaN 49.238398 -123.031811 ZN 1 NaN 1 NaN 1 Restaurant Transportation Shop GreenSpace Service School Residential Recreation Office Hotel
8897 NaN Patterson Station NaN 49.229626 -123.012670 ZN 2 NaN 1 NaN 1 Restaurant Recreation Shop GreenSpace Bank Transportation Service School Residential Office
8899 NaN Royal Oak Station NaN 49.220068 -122.988480 ZN 2 NaN 1 NaN 1 Restaurant Shop Transportation Recreation Service School Residential Office Hotel HealthCare
8902 NaN New Westminster Station NaN 49.201437 -122.912614 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Bank Service Residential Office Hotel HealthCare GreenSpace
8903 NaN Columbia Station NaN 49.204830 -122.906175 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation GreenSpace HealthCare Attraction Transportation Service School Residential
8908 NaN Surrey Central Station NaN 49.189562 -122.847868 ZN 3 NaN 1 NaN 1 Restaurant Shop Recreation Transportation Bank School Office HealthCare Service Residential
8909 NaN King George Station NaN 49.182790 -122.844737 ZN 3 NaN 1 NaN 1 Restaurant Shop Recreation Office GreenSpace Transportation Service School Residential Hotel
8910 NaN Port Coquitlam Station NaN 49.261499 -122.773999 WCE2Z NaN 1 NaN 1 Restaurant Shop Transportation Recreation GreenSpace Service School Residential Office Hotel
8911 NaN Pitt Meadows Station NaN 49.225796 -122.688379 WCE3Z NaN 1 NaN 1 Restaurant Shop Recreation GreenSpace Transportation Hotel Service School Residential Office
8915 NaN Lonsdale Quay Station NaN 49.310142 -123.083309 ZN 2 NaN 1 NaN 1 Restaurant Shop Recreation Hotel GreenSpace Transportation HealthCare Service School Residential
In [78]:
# Stations assigned to cluster label 2, showing every column except the first one.
cluster_merge.loc[cluster_merge['Cluster Labels'].eq(2), cluster_merge.columns[1:]]
Out[78]:
stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
8860 NaN Templeton Station NaN 49.196688 -123.146337 ZN 2 NaN 1 NaN 2 Shop Restaurant GreenSpace Transportation Service School Residential Recreation Office Hotel
8867 NaN Oakridge-41st Avenue Station NaN 49.233739 -123.116295 ZN 1 NaN 1 NaN 2 Shop Restaurant Recreation Hotel GreenSpace Bank Transportation Service School Residential
8873 NaN VCC-Clark Station NaN 49.265845 -123.079069 ZN 1 NaN 1 NaN 2 Shop Transportation Restaurant GreenSpace Attraction Service School Residential Recreation Office
8876 NaN Rupert Station NaN 49.260784 -123.032867 ZN 1 NaN 1 NaN 2 Shop Restaurant Transportation Service Office School Residential Recreation Hotel HealthCare
8878 NaN Brentwood Town Centre Station NaN 49.266396 -123.001829 ZN 2 NaN 1 NaN 2 Shop Restaurant HealthCare Transportation Bank Service School Residential Recreation Office
8884 NaN Burquitlam Station NaN 49.261403 -122.889834 ZN 3 NaN 1 NaN 2 Shop Restaurant Transportation HealthCare Service School Residential Recreation Office Hotel
8886 NaN Inlet Centre Station NaN 49.277210 -122.828187 ZN 3 NaN 1 NaN 2 Shop Restaurant Recreation Bank Transportation Service School Residential Office Hotel
8887 NaN Coquitlam Central Station NaN 49.274340 -122.800421 ZN 3 NaN 1 NaN 2 Restaurant Shop Transportation HealthCare Bank Service School Residential Recreation Office
8888 NaN Lincoln Station NaN 49.280423 -122.793917 ZN 3 NaN 1 NaN 2 Shop Restaurant HealthCare Recreation Transportation Service School Residential Office Hotel
8890 NaN Burrard Station NaN 49.285614 -123.119557 ZN 1 NaN 1 NaN 2 Shop Restaurant Hotel Recreation Office HealthCare Attraction Transportation Service School
8898 NaN Metrotown Station NaN 49.225825 -123.003920 ZN 2 NaN 1 NaN 2 Restaurant Shop Recreation Hotel Transportation Service School Residential Office HealthCare
8904 NaN Sapperton Station NaN 49.224662 -122.889393 ZN 2 NaN 1 NaN 2 Shop Restaurant Transportation Recreation HealthCare GreenSpace Bank Service School Residential
In [81]:
# Stations assigned to cluster label 3, showing every column except the first one.
cluster_merge.loc[cluster_merge['Cluster Labels'].eq(3), cluster_merge.columns[1:]]
Out[81]:
stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
8866 NaN Langara-49th Avenue Station NaN 49.225730 -123.116523 ZN 1 NaN 1 NaN 3 Transportation GreenSpace Recreation Shop Service School Restaurant Residential Office Hotel
8881 NaN Lake City Way Station NaN 49.254635 -122.939192 ZN 2 NaN 1 NaN 3 Transportation Shop GreenSpace Service School Restaurant Residential Recreation Office Hotel
8895 NaN 29th Avenue Station NaN 49.244283 -123.046113 ZN 1 NaN 1 NaN 3 Transportation GreenSpace Shop Restaurant Recreation Service School Residential Office Hotel
8900 NaN Edmonds Station NaN 49.211936 -122.959104 ZN 2 NaN 1 NaN 3 GreenSpace Transportation Shop Restaurant Recreation Service School Residential Office Hotel
In [79]:
# Stations assigned to cluster label 4, showing every column except the first one.
cluster_merge.loc[cluster_merge['Cluster Labels'].eq(4), cluster_merge.columns[1:]]
Out[79]:
stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station Cluster Labels 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
8858 NaN YVR-Airport Station NaN 49.194174 -123.178269 ZN 2 NaN 1 NaN 4 Transportation Shop Restaurant GreenSpace Hotel Bank Service School Residential Recreation
8864 NaN Bridgeport Station NaN 49.195538 -123.126062 ZN 2 NaN 1 NaN 4 Transportation Restaurant Recreation Hotel Shop Service School Residential Office HealthCare
8865 NaN Marine Drive Station NaN 49.209666 -123.117015 ZN 1 NaN 1 NaN 4 Restaurant Shop Transportation Bank Recreation HealthCare Service School Residential Office
8868 NaN King Edward Station NaN 49.249175 -123.115450 ZN 1 NaN 1 NaN 4 Transportation Shop Restaurant GreenSpace Service School Residential Recreation Office Hotel
8889 NaN Lafarge Lake-Douglas Station NaN 49.285718 -122.791541 ZN 3 NaN 1 NaN 4 Shop Restaurant Recreation GreenSpace Transportation Attraction Service School Residential Office
8906 NaN Scott Road Station NaN 49.204420 -122.874197 ZN 3 NaN 1 NaN 4 Transportation Shop Restaurant Recreation Service School Residential Office Hotel HealthCare
8907 NaN Gateway Station NaN 49.198955 -122.850610 ZN 3 NaN 1 NaN 4 Restaurant Shop Transportation Recreation Service HealthCare School Residential Office Hotel
8912 NaN Maple Meadows Station NaN 49.216513 -122.666210 WCE3Z NaN 1 NaN 4 Transportation Shop Restaurant Recreation Service School Residential Office Hotel HealthCare
8914 NaN Mission City Station NaN 49.133694 -122.304898 WCE4Z NaN 1 NaN 4 Transportation Shop Restaurant Recreation Bank Service School Residential Office Hotel

Rapid Transit Station Clusters (item n below describes the stations with Cluster Label n-1 in the tables above):

  1. Transit Exchange
  2. Restaurant + Shop + Recreation
  3. Shopping
  4. Transportation Hub + Park/Green Space
  5. Transportation Hub + Shop + Restaurants
In [ ]:
 
In [ ]: